{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4f979bc6",
   "metadata": {},
   "source": [
    "### Duplications leading to tx readthrough filter\n",
    "\n",
    "Criteria: \n",
    "\n",
    "* Duplication overlaps entire misexpressed gene \n",
    "* Duplication partially overlaps 5' end of an expressed gene (median TPM > 0.5)\n",
    "* Misexpressed and overlapping gene on the same strand \n",
    "* No active gene located in the expected readthrough region on same strand \n",
    "* Misexpressed gene upstream of transcribed gene "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "deb31da1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "import pysam\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c59d2633",
   "metadata": {},
   "outputs": [],
   "source": [
    "# working directory \n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "# input files \n",
    "misexp_vrnt_feat_path = wkdir_path.joinpath(\"6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv\")\n",
    "tpm_mtx_path = wkdir_path.joinpath(\"1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv\")\n",
    "gencode_bed_path = wkdir_path.joinpath(\"3_misexp_genes/bed_files/all_genes.bed\")\n",
    "# output\n",
    "out_dir = wkdir_path.joinpath(\"6_misexp_dissect/tx_readthrough/duplications/\")\n",
    "out_dir_path = Path(out_dir)\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)\n",
    "# constants \n",
    "tpm_cutoff = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "39b26601",
   "metadata": {},
   "outputs": [],
   "source": [
    "# only include overlapping genes that are transcribed (median TPM > 0.5)\n",
    "# load gene expression matrix \n",
    "tpm_mtx_df = pd.read_csv(tpm_mtx_path)\n",
    "# add median expression of gene across INTERVAL \n",
    "median_tpm_df = pd.DataFrame(tpm_mtx_df.set_index(\"gene_id\").median(axis=1))\n",
    "median_tpm_df = median_tpm_df.rename(columns={0:\"median_tpm\"}).reset_index()\n",
    "genes_median_tpm05 = median_tpm_df[median_tpm_df.median_tpm > tpm_cutoff].gene_id.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "47b337dc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load GENCODE genes bed file \n",
    "gencode_bed = BedTool(gencode_bed_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "36227af5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of misexpression-associated duplications: 16\n"
     ]
    }
   ],
   "source": [
    "misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep=\"\\t\")\n",
    "misexp_dup_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == \"DUP\"]\n",
    "misexp_gene_cols = {\"gene_id\": \"misexp_gene_id\", \"gene_start\": \"misexp_gene_start\", \n",
    "                    \"gene_end\": \"misexp_gene_end\", \"gene_strand\": \"misexp_gene_strand\"}\n",
    "misexp_dup_feat_df = misexp_dup_feat_df.rename(columns=misexp_gene_cols)\n",
    "\n",
    "misexp_dups = misexp_dup_feat_df.vrnt_id.unique()\n",
    "print(f\"Number of misexpression-associated duplications: {len(misexp_dups)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "298bcc73",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of variants overlapping the entire gene: 5\n"
     ]
    }
   ],
   "source": [
    "# select DUPs overlapping the entire misexpressed gene \n",
    "dup_overlap_entire_gene_df = misexp_dup_feat_df[misexp_dup_feat_df.position == \"Entire gene\"]\n",
    "dup_overlap_entire_gene = dup_overlap_entire_gene_df.vrnt_id.unique()\n",
    "print(f\"Number of variants overlapping the entire gene: {len(dup_overlap_entire_gene)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "674b5ee6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to required variant-level features \n",
    "vrnt_features_cols = [\"vrnt_id\", \"misexp_gene_id\", \"chrom\", \"sv_start\", \"sv_end\", \"misexp_gene_start\", \"misexp_gene_end\", \"misexp_gene_strand\"]\n",
    "dup_entire_gene_overlap_trunc_df = dup_overlap_entire_gene_df[vrnt_features_cols].drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "501cdb2a",
   "metadata": {},
   "source": [
    "**Select DUPs that overlap 5' end of a gene**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5ca269de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load variants in tested windows \n",
    "vrnt_id_in_window_chr_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "vrnt_id_in_window_chr_bed = BedTool(vrnt_id_in_window_chr_path)\n",
    "# intersect SVs and genes \n",
    "sv_overlap_gene_str = StringIO(str(vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)))\n",
    "# read intersect between SVs and genes \n",
    "columns={0: \"sv_chrom\", 1: \"sv_start\", 2: \"sv_end\", 3:\"vrnt_id\", \n",
    "         4:\"gene_chrom\", 5:\"overlap_gene_start\", 6:\"overlap_gene_end\", 7:\"overlap_gene_id\", \n",
    "         8:\"score\", 9:\"overlap_gene_strand\", 10:\"gene_overlap\"\n",
    "        }\n",
    "sv_overlap_gene_df = pd.read_csv(sv_overlap_gene_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=columns)\n",
    "# subset to relevant features \n",
    "sv_overlap_gene_feat_df = sv_overlap_gene_df[[\"vrnt_id\", \"overlap_gene_start\", \"overlap_gene_end\", \"overlap_gene_id\", \"overlap_gene_strand\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4e3d4510",
   "metadata": {},
   "outputs": [],
   "source": [
    "def overlaps_gene_5_prime(row): \n",
    "    if row[\"overlap_gene_strand\"] == \"+\": \n",
    "        return (row[\"sv_start\"] < row[\"overlap_gene_start\"]) & (row[\"sv_end\"] > row[\"overlap_gene_start\"]) & (row[\"sv_end\"] < row[\"overlap_gene_end\"])\n",
    "    elif row[\"overlap_gene_strand\"] == \"-\": \n",
    "        return (row[\"sv_start\"] > row[\"overlap_gene_start\"]) & (row[\"overlap_gene_end\"] > row[\"sv_start\"]) & (row[\"overlap_gene_end\"] < row[\"sv_end\"])\n",
    "    else: \n",
    "        return np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "41b5e816",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add overlapping genes to misexpression DELs that are upstream \n",
    "misexp_dup_overlap_genes_df = pd.merge(dup_entire_gene_overlap_trunc_df, \n",
    "                                       sv_overlap_gene_feat_df, \n",
    "                                       on=\"vrnt_id\", \n",
    "                                       how=\"left\")\n",
    "# annotate genes that overlap the 3' end \n",
    "misexp_dup_overlap_genes_df[\"overlap_gene_5_prime\"] = misexp_dup_overlap_genes_df.apply(overlaps_gene_5_prime, axis=1)\n",
    "# variant overlaps gene that is expressed in blood \n",
    "misexp_dup_overlap_genes_df[\"overlap_gene_expressed\"] = np.where(misexp_dup_overlap_genes_df.overlap_gene_id.isin(genes_median_tpm05), True, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "668d41b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "### check for active genes in readthrough region \n",
    "# maybe want to adjust this to require entire gene overlap?\n",
    "region_gene_cols = {0:\"sv_chrom\", 1:\"sv_start\", 2:\"sv_end\", 3:\"name\", 4:\"score\", 5:\"strand\",  \n",
    "                    6: \"gene_chrom\", 7:\"gene_start\", 8:\"gene_end\", 9:\"gene_id\", 10:\"gene_score\", 11:\"gene_strand\", \n",
    "                    12:\"overlap\"\n",
    "                   }\n",
    "\n",
    "def no_active_intervening_gene(row):\n",
    "    \"\"\" \n",
    "    Check if there is an active gene on the same strand as the misexpressed gene in the upstream \n",
    "    region between the SV and the misexpressed gene. Input must be subset to upstream SVs for \n",
    "    valid .bed file coordinates. \n",
    "    \n",
    "    Returns True if no active intervening gene and False if there is one. \n",
    "    \"\"\"\n",
    "    chrom = row[\"chrom\"]\n",
    "    gene_strand = row[\"misexp_gene_strand\"]\n",
    "    vrnt_gene_id = f'{row[\"vrnt_id\"]}_{row[\"misexp_gene_id\"]}'\n",
    "    if row[\"misexp_gene_strand\"] == \"+\":\n",
    "        region_start = row[\"sv_start\"]\n",
    "        region_end = row[\"misexp_gene_start\"]\n",
    "    elif row[\"misexp_gene_strand\"] == \"-\":\n",
    "        region_start = row[\"misexp_gene_end\"]\n",
    "        region_end = row[\"sv_end\"]   \n",
    "    else: \n",
    "        raise ValueError(\"Strand not recognised.\")\n",
    "    # create .bed file for region and intersect with gene bed file \n",
    "    region_bed = BedTool(f\"{chrom} {region_start} {region_end} {vrnt_gene_id} 0 {gene_strand}\", from_string=True)\n",
    "    # only report genes that are 100% contained within the readthrough region \n",
    "    region_gene_intersect_str = StringIO(str(region_bed.intersect(gencode_bed, wo=True, F=1)))\n",
    "    # check if string is empty - if empty no intervening genes return False \n",
    "    # if not empty check if intervening gens are active \n",
    "    if region_gene_intersect_str.getvalue().strip():\n",
    "        region_gene_intersect_str.seek(0) # move cursor to beginning\n",
    "        region_gene_intersect_df = pd.read_csv(region_gene_intersect_str, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=region_gene_cols)\n",
    "        # N.B. subset to genes on the same strand as the misexpressed gene \n",
    "        region_gene_intersect_shared_strand_df = region_gene_intersect_df[region_gene_intersect_df.gene_strand == gene_strand]\n",
    "        genes_in_region = set(region_gene_intersect_shared_strand_df.gene_id.unique())\n",
    "        if len(genes_in_region.intersection(genes_median_tpm05)) > 0: \n",
    "            return False\n",
    "        else: \n",
    "            return True\n",
    "    else: \n",
    "        return True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "98222563",
   "metadata": {},
   "outputs": [],
   "source": [
    "misexp_dup_overlap_genes_df[\"no_active_gene\"] = misexp_dup_overlap_genes_df.apply(no_active_intervening_gene, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6c130f85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to DUPs that have overlap gene 5' end, \n",
    "# downstream gene expressed in whole blood\n",
    "# on same strand as the misexpressed gene\n",
    "# no intervening active genes \n",
    "misexp_dup_candidates_df = misexp_dup_overlap_genes_df[(misexp_dup_overlap_genes_df.overlap_gene_5_prime) &\n",
    "                                                      (misexp_dup_overlap_genes_df.overlap_gene_expressed) &\n",
    "                                                      (misexp_dup_overlap_genes_df.misexp_gene_strand == misexp_dup_overlap_genes_df.overlap_gene_strand) &\n",
    "                                                      (misexp_dup_overlap_genes_df.no_active_gene)\n",
    "                                                     ].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "bb9c6e27",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of DUP tx readthrough candidates: 5\n",
      "DUP tx readthrough candidates: ['408686' '425231' '397101' '414685' '414879']\n"
     ]
    }
   ],
   "source": [
    "misexp_tx_readthrough_dups = misexp_dup_candidates_df.vrnt_id.unique()\n",
    "print(f\"Number of DUP tx readthrough candidates: {len(misexp_tx_readthrough_dups)}\")\n",
    "print(f\"DUP tx readthrough candidates: {misexp_tx_readthrough_dups}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2303eeb6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# select closest gene \n",
    "def gene_distance(row): \n",
    "    \"\"\"\"\"\"\n",
    "    if row[\"misexp_gene_strand\"] == \"+\": \n",
    "        return row[\"misexp_gene_start\"] - row[\"sv_start\"]\n",
    "    elif row[\"misexp_gene_strand\"] == \"-\":\n",
    "        return row[\"sv_end\"] - row[\"misexp_gene_end\"]\n",
    "    else:\n",
    "        raise ValueError(\"Strand not recognised.\")\n",
    "        \n",
    "    \n",
    "misexp_dup_candidates_df[\"distance\"] = misexp_dup_candidates_df.apply(gene_distance, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1f92200f",
   "metadata": {},
   "outputs": [],
   "source": [
    "idx_closest_distance = misexp_dup_candidates_df.groupby(\"vrnt_id\").distance.idxmin()\n",
    "misexp_dup_candidates_closest_df = misexp_dup_candidates_df.loc[idx_closest_distance].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ddc49d18",
   "metadata": {},
   "source": [
    "**Metrics of transcriptional readthrough candidate DELs**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "170132fb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of transcriptional readthrough candidate variant-gene pairs: 5\n"
     ]
    }
   ],
   "source": [
    "# merge with misexpression variant features for metrics \n",
    "misexp_tx_read_vrnt_gene_df = misexp_dup_candidates_closest_df[[\"vrnt_id\", \"misexp_gene_id\", \"overlap_gene_id\", \"distance\"]].drop_duplicates()\n",
    "misexp_tx_read_vrnt_gene_df = misexp_tx_read_vrnt_gene_df.rename(columns={\"misexp_gene_id\": \"gene_id\"})\n",
    "print(f\"Number of transcriptional readthrough candidate variant-gene pairs: {len(misexp_tx_read_vrnt_gene_df)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "b2d6c7bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# add variant features \n",
    "misexp_tx_read_vrnt_feat_df = pd.merge(misexp_tx_read_vrnt_gene_df, \n",
    "                                       misexp_vrnt_feat_df, \n",
    "                                       on=[\"vrnt_id\", \"gene_id\"], \n",
    "                                       how=\"inner\")\n",
    "# write to file \n",
    "misexp_tx_read_vrnt_feat_path = out_dir_path.joinpath(\"misexp_dup_tx_readthrough_candidates.tsv\")\n",
    "misexp_tx_read_vrnt_feat_df.to_csv(misexp_tx_read_vrnt_feat_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7beba41a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write list of variants to file \n",
    "misexp_tx_read_vrnts = misexp_dup_candidates_closest_df.vrnt_id.unique()\n",
    "misexp_tx_read_vrnts_path = out_dir_path.joinpath(\"misexp_dup_tx_vrnts.txt\") \n",
    "with open(misexp_tx_read_vrnts_path, 'w') as f_out: \n",
    "    for vrnt_id in misexp_tx_read_vrnts: \n",
    "        f_out.write(f\"{vrnt_id}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4653bd92",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
